1 Setup

This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)

  • Libraries
suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(emoji)
  library(gplots)
  library(gtools)
  library(here)
  library(hyperSpec)
  library(limma)
  library(magrittr)
  library(parallel)
  library(patchwork)
  library(PCAtools)
  library(pheatmap)
  library(plotly)
  library(pvclust)
  library(RColorBrewer)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
  • Helper functions
source(here("UPSCb-common/src/R/featureSelection.R"))
  • Graphics
hpal <- colorRampPalette(c("blue","white","red"))(100)
pal <- brewer.pal(9,"Blues")

2 Data

  • Sample information
samples <- read_tsv(here("doc/samples.tsv"),
                    col_types=cols(.default=col_factor()))

Read the expression at the gene level

load(here("data/tximport.rda"))
counts <- txi$counts
colnames(counts) <- samples$SampleID

 

3 Quality Control

  • “Not expressed” genes
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "20% percent (24586) of 122964 genes are not expressed"

3.1 Sequencing depth

  • Let us take a look at the sequencing depth, colouring by Stage and splitting by Tissue
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(samples)

ggplot(dat,aes(x,y,fill=Stage)) + 
  geom_col() + 
  scale_y_continuous(name="reads") +
  facet_grid(~ factor(Tissue), scales = "free") +
  theme_bw() + 
  theme(axis.text.x=element_text(angle=90,size=4),
        axis.title.x=element_blank())

👉 We observe +/- 20% difference in the raw sequencing depth

3.2 per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density(na.rm = TRUE) +
  ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)") + 
  theme_bw()

👉 The cumulative gene coverage is as biased towards low level expression.

3.3 Per-sample expression

dat <- as.data.frame(log10(counts)) %>% 
  utils::stack() %>% 
  mutate(Tissue=samples$Tissue[match(ind,samples$SampleID)]) %>% 
  mutate(Stage=samples$Stage[match(ind,samples$SampleID)])

ggplot(dat,aes(x=values,group=ind,col=Stage)) + 
  geom_density(na.rm = TRUE) + 
  ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)") + 
  theme_bw() +
  facet_grid(~ factor(Tissue), scales = "free")

ggplot(dat,aes(x=values,group=ind,col=Tissue)) + 
  geom_density(na.rm = TRUE) + 
  ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)") + 
  theme_bw()

👉 All samples have the same sequencing depth characteristics and there is a slight deviation when we look at the tissue type. The FMG has more medium expressed genes and less highly expressed ones tham the ZE.

  • Export raw expression data
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))

 

4 Data normalisation

4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

dds <- DESeqDataSetFromTximport(
  txi=txi,
  colData = samples,
  design = ~ Stage * Tissue)
## using counts and average transcript lengths from tximport
colnames(dds) <- samples$SampleID

save(dds,file=here("data/analysis/salmon/dds.rda"))

4.2 size factors

(i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds) %>% 
  suppressMessages()

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y")
abline(h=1, col = "Red", lty = 3)

and without outliers:

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)

👉 There is some slight differences in the libraries’ size factors. They are all within +/- 60% of the average library size. Some library also show a larger spread around the median, which indicates that genes tend to be quite variable in their expression.

Assess whether there might be a difference in library size linked to a given metadata

boxplot(split(t(normalizationFactors(dds)),dds$Stage),las=2,
        main="Sequencing libraries size factor by Tissue",
        outline=FALSE,notch=TRUE)

boxplot(split(t(normalizationFactors(dds)),dds$Tissue),las=2,
        main="Sequencing libraries size factor by Tissue",
        outline=FALSE,notch=TRUE)

👉 The scaling factor distribution is dependent on both variables. This is potentially a caveat for the DE comparison between tissues.

plot(colMeans(normalizationFactors(dds)),
     log10(colSums(counts(dds))),ylab="log10 raw depth",
     xlab="average scaling factor",
     col=rainbow(n=nlevels(dds$Tissue))[as.integer(dds$Tissue)],
     pch=c(17,19)[as.integer(dds$Stage)])
legend("bottomright",fill=rainbow(n=nlevels(dds$Tissue)),
       legend=levels(dds$Tissue),cex=0.6)

👉 The scaling factor appear linearly proportional to the sequencing depth, but is clearly partitioned by tissue.

4.3 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)

4.4 Validation

let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.

Before:

meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])

After:

meanSdPlot(vst[rowSums(vst)>0,])

After VST normalization, the red line is almost horizontal which indicates no dependency of variance on mean (homoscedastic).

👉 We can conclude that the variance stabilization provides only minor gain. This is something to keep in mind in the rest of the analysis. It is most likely due to the difference between tissues and the amount of lowly expressed genes.


 

4.5 QC on the normalised data

4.5.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

Using PCAtools

p <- pca(vst,colData(dds))

4.5.2 Scree plot

We define the number of variable of the model:

vars <- all.vars(design(dds))
nvar <- length(vars)
nlevel<-reduce(sapply(vars,function(v){nlevels(eval(parse(text=paste0("dds$",v))))}),`*`)

We devise the optimal number of components using two methods

horn <- suppressWarnings(parallelPCA(vst))
elbow <- findElbowPoint(p$variance)

We plot the percentage explained by different components and try to empirically assess whether the observed number of components would be in agreement with our model’s assumptions.

  • the red line represents number of variables in the model
  • the orange line represents number of variable combinations.
  • the black dotted, annotate lines represent the optimal number of components reported by the horn and elbow methods.
ggplot(tibble(x=1:length(percent),y=cumsum(percent),p=percent),aes(x=x,y=y)) +
  geom_line() + geom_col(aes(x,p)) + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component",breaks=1:length(percent),minor_breaks=NULL) + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",linewidth=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",linewidth=0.5) +
  geom_vline(xintercept=c(horn$n,elbow),colour="black",linetype="dotted",linewidth=0.5) +
  geom_label(aes(x = horn$n + 1, y = cumsum(percent)[horn$n],label = 'Horn', vjust = 1)) +
  geom_label(aes(x = elbow + 1, y = cumsum(percent)[elbow],label = 'Elbow', vjust = 1))

👉 The first component explains 29% of the data variance. Both metrics, Horn and Elbow suggest that two or three components (the dark doted lines) are those that are informative. Indeed the slope of the curve is fairly linear past PC3/PC4 and that would indicate that the remaining PCs only capture sample specific noise. While this is only empirical, the scree plot support having only few variables of importance in the dataset.

4.5.3 PCA plot

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    PC3=pc$x[,3],
                    as.data.frame(colData(dds)))
  • PC1 vs PC2
p1 <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Tissue,shape=Stage,text=SampleID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p1) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()

The same as a biplot

biplot(p,
       colby = 'Tissue',
       colLegendTitle = 'Tissue',
       encircle = TRUE,
       encircleFill = TRUE,
       legendPosition = 'top', 
       legendLabSize = 16, legendIconSize = 8.0)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

👉 The replicates cluster together, the first dimension separates the Tissue while the second one separates the Satges

  • PC1 vs PC3
p2 <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=Tissue,shape=Stage,text=SampleID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p2) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()

The same as a biplot

p$metadata$Condition <- paste(samples$Tissue,samples$Stage)
biplot(p,x = 'PC1', y = 'PC3',
       colby = 'Condition',
       colLegendTitle = 'Condition',
       encircle = TRUE,
       encircleFill = TRUE,
       legendPosition = 'top', 
       legendLabSize = 16, legendIconSize = 8.0)

👉 The third dimension is interesting as it separates probably some interaction between Tissue and Stage

subplot(style(p1, showlegend = FALSE), p2,
        titleX = TRUE, titleY = TRUE, nrows = 1, margin = c(0.05, 0.05, 0, 0))

4.5.4 Pairs plot

This allows for looking at more dimensions, five by default

suppressMessages(pairsplot(p,colby='Tissue',shape='Stage'))

👉 While PC 1 to 3 can be linked to the study design, it looks more like PC4 and 5 are linked to individual samples.

4.5.5 Loadings

Loadings, i.e. the individual effect of every gene in a component can be studied. Here the most important ones are visualized throughout the different PCs

plotloadings(p,
             rangeRetain = 0.01,
             labSize = 4.0,
             title = 'Loadings plot',
             subtitle = 'PC1 to PC5',
             caption = 'Top 1% variables',
             drawConnectors = TRUE)
## -- variables retained:
## TRINITY_GG_187825_c0_g1, TRINITY_GG_236061_c0_g1, TRINITY_GG_250836_c0_g1, TRINITY_GG_41618_c1_g2, TRINITY_GG_129318_c0_g1, TRINITY_GG_84060_c0_g1, TRINITY_GG_250070_c0_g1, TRINITY_GG_32529_c0_g1, TRINITY_GG_73965_c0_g1, TRINITY_GG_331010_c0_g2, TRINITY_GG_28255_c0_g2, TRINITY_GG_188596_c0_g2, TRINITY_GG_154577_c0_g1, TRINITY_GG_312913_c0_g1
## Warning: ggrepel: 43 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

👉 These are genes that have a strong influence in the different components. Probably interesting candidates to look at, in a non-DE fashion, although they are likely to be DE, given the first three PCs are linked to our design.

4.5.6 Correlation

This is a plot showing the correlation between the PC and the model variables. Note that while this might be relevant for a linear variable, it is less so for categorical variables. Sorting categorical variables in a linear order according to the PCs above might help.

suppressWarnings(eigencorplot(p,metavars=c('Tissue','Stage')))

👉 Keep in mind that the two variables are categorical, so they have been assigned an integer value based on that. They are not truly continuous variables. But clearly as we could see PC1 is strongly linked to Tissue and PC2 to Stage.

4.5.7 Samples Distance

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(sampleDistMatrix) <- paste(dds$Tissue,dds$Stage)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=pal)

👉 The sample distance clusters samples as expected

4.6 Sequencing depth

The figures show the number of genes expressed per condition at different expression cutoffs. The scale on the lower plot is the same as on the upper. The first plot is a heatmap showing the number of genes above a given cutoff. The second plot shows it as a ratio of the number of genes expressed for (a) given variable(s) divided by the average number of genes expressed at that cutoff across all variable(s). The latter plot is of course biased at higher cutoff as the number of genes becomes smaller and smaller. The point of these two plots is to assert whether the number of genes expressed varies between conditions, as this would break some assumptions for normalisation and differential expression.

conds <- factor(paste(dds$Tissue,dds$Stage))
dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=conds,
                                nrep=3)

👉 As expected from the raw data QC, the FMG samples have more genes expressed at a higher value than the ZE. This is most likely indicative of a bias in the number of genes expressed with the FMG expressing less genes overall. The library size correction would then “wrongly” inflate the expression of the genes for the FMG. This is because we are breaking the assumption from DESeq2 that there are the same number of genes expressed across samples.

Plotting the number of genes that are expressed (at any level)

do.call(rbind,split(t(nrow(vst) - colSums(vst==0)),samples$Tissue)) %>% as.data.frame() %>% 
  rownames_to_column("Tissue") %>% pivot_longer(starts_with("V")) %>% 
  ggplot(aes(x=Tissue, y=value, fill=Tissue)) + geom_dotplot(binaxis = "y", stackdir = "center")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

👉 As suspected, the FMG has about 10000 less genes expressed than the ZE. This is probably due to the low expressed genes (but not necessarily). Here it would be good to do some stronger filtering on the transcripts we are using for salmon.

4.7 Heatmap

Here we want to visualise all the informative genes as a heatmap. We first filter the genes to remove those below the selected noise/signal cutoff. The method employed here is naive, and relies on observing a sharp decrease in the number of genes within the first few low level of expression. Using an independent filtering method, such as implemented in DESeq2 would be more accurate, but for the purpose of QA validation, a naive approach is sufficient. Note that a sweet spot for computation is around 20 thousand genes, as building the hierarchical clustering for the heatmap scales non-linearly.

sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3) %>% 
  suppressWarnings()

👉 Here a cutoff of 5 is applied (reason is 20000 genes can still be classified by a hierarchical clustering in a decent amount of time)

vst.cutoff <- 5

nn <- nrow(vst[sels[[vst.cutoff+1]],])
tn <- nrow(vst)
pn <- round(nn * 100/ tn, digits=1)

⚠️ 14.7% (18092) of total 122964 genes are plotted below:

mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
               color = hpal,
               border_color = NA,
               clustering_distance_rows = "correlation",
               clustering_method = "ward.D2",
               show_rownames = FALSE,
               labels_col = conds,
               angle_col = 90,
               legend = FALSE)

👉 The samples cluster as expected, note though that now the stage has a stronger importance than the tissue in the grouping!

4.8 Clustering of samples

Below we assess the previous dendrogram’s reproducibility and plot the clusters with au and bp where:

  • au (Approximately Unbiased): computed by multiscale bootstrap resampling 👈 a better approximation
  • bp (Bootstrap Probability): computed by normal bootstrap resampling

⚠️Clusters with AU larger than 95% are highlighted by rectangles, which are strongly supported by data

hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                       method.hclust = "ward.D2", 
                       nboot = 30, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)

👉 The classification here is similar and place the stage together


 

5 Summary

The data is of good quality

The replicates group together

The PCA first three components are relevant biologically

There is good evidence that there will be DE genes as a result of the selected design

The FMG has an average 10000 less genes expressed than the ZE, affecting the library size correction and hence the possible DE results.

The aforementioned limitation could be addressed by trying to 1. selecting the transcripts more aggressively so as to remove lowly expressed ones - or putative transposable elements or 2. modelling the expression difference and correcting the library size factor correspondingly

A third alternative is to run the DE but imposes more stringent cutoffs on the log2 fold changes or to use the independent filtering to help remove uninformative transcripts from the analysis, prior to rerunning the QA and DE.

6 Session Info

Session Info
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.70.0                  tximport_1.30.0            
##  [3] lubridate_1.9.3             forcats_1.0.0              
##  [5] stringr_1.5.1               dplyr_1.1.4                
##  [7] purrr_1.0.2                 readr_2.1.4                
##  [9] tidyr_1.3.0                 tibble_3.2.1               
## [11] tidyverse_2.0.0             RColorBrewer_1.1-3         
## [13] pvclust_2.2-0               plotly_4.10.3              
## [15] pheatmap_1.0.12             PCAtools_2.14.0            
## [17] ggrepel_0.9.4               patchwork_1.1.3            
## [19] magrittr_2.0.3              limma_3.58.1               
## [21] hyperSpec_0.100.0           xml2_1.3.5                 
## [23] ggplot2_3.4.4               lattice_0.21-8             
## [25] here_1.0.1                  gtools_3.9.5               
## [27] gplots_3.1.3                emoji_15.0                 
## [29] DESeq2_1.42.0               SummarizedExperiment_1.32.0
## [31] Biobase_2.62.0              MatrixGenerics_1.14.0      
## [33] matrixStats_1.1.0           GenomicRanges_1.54.1       
## [35] GenomeInfoDb_1.38.1         IRanges_2.36.0             
## [37] S4Vectors_0.40.2            BiocGenerics_0.48.1        
## [39] data.table_1.14.8          
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.15.0         jsonlite_1.8.7           
##  [3] farver_2.1.1              rmarkdown_2.25           
##  [5] zlibbioc_1.48.0           vctrs_0.6.4              
##  [7] DelayedMatrixStats_1.24.0 RCurl_1.98-1.13          
##  [9] htmltools_0.5.7           S4Arrays_1.2.0           
## [11] SparseArray_1.2.2         proj4_1.0-13             
## [13] sass_0.4.7                KernSmooth_2.23-22       
## [15] bslib_0.6.1               htmlwidgets_1.6.3        
## [17] plyr_1.8.9                testthat_3.2.0           
## [19] cachem_1.0.8              ggalt_0.4.0              
## [21] lifecycle_1.0.4           pkgconfig_2.0.3          
## [23] rsvd_1.0.5                Matrix_1.6-0             
## [25] R6_2.5.1                  fastmap_1.1.1            
## [27] GenomeInfoDbData_1.2.11   digest_0.6.33            
## [29] colorspace_2.1-0          rprojroot_2.0.4          
## [31] dqrng_0.3.1               irlba_2.3.5.1            
## [33] crosstalk_1.2.1           beachmat_2.18.0          
## [35] labeling_0.4.3            fansi_1.0.5              
## [37] timechange_0.2.0          httr_1.4.7               
## [39] abind_1.4-5               compiler_4.3.1           
## [41] bit64_4.0.5               withr_2.5.2              
## [43] BiocParallel_1.36.0       hexbin_1.28.3            
## [45] highr_0.10                Rttf2pt1_1.3.12          
## [47] maps_3.4.1.1              MASS_7.3-60              
## [49] DelayedArray_0.28.0       caTools_1.18.2           
## [51] tools_4.3.1               extrafontdb_1.0          
## [53] glue_1.6.2                reshape2_1.4.4           
## [55] generics_0.1.3            gtable_0.3.4             
## [57] tzdb_0.4.0                preprocessCore_1.64.0    
## [59] hms_1.1.3                 BiocSingular_1.18.0      
## [61] ScaledMatrix_1.10.0       utf8_1.2.4               
## [63] XVector_0.42.0            pillar_1.9.0             
## [65] vroom_1.6.4               bit_4.0.5                
## [67] deldir_2.0-2              tidyselect_1.2.0         
## [69] locfit_1.5-9.8            knitr_1.45               
## [71] xfun_0.41                 statmod_1.5.0            
## [73] brio_1.1.3                stringi_1.8.2            
## [75] lazyeval_0.2.2            yaml_2.3.7               
## [77] evaluate_0.23             codetools_0.2-19         
## [79] interp_1.1-5              extrafont_0.19           
## [81] BiocManager_1.30.22       cli_3.6.1                
## [83] affyio_1.72.0             ash_1.0-15               
## [85] munsell_0.5.0             jquerylib_0.1.4          
## [87] Rcpp_1.0.11               png_0.1-8                
## [89] ellipsis_0.3.2            latticeExtra_0.6-30      
## [91] jpeg_0.1-10               sparseMatrixStats_1.14.0 
## [93] bitops_1.0-7              viridisLite_0.4.2        
## [95] scales_1.3.0              affy_1.80.0              
## [97] crayon_1.5.2              rlang_1.1.2              
## [99] cowplot_1.1.1

 

 

drawing

Created by YOURNAME

YOUREMAIL